5 Metagenomic data
Species-specific community analyses conducted to generate the data included in these analyses can be found in the annex.
5.1 Library complexity
Nonpareil estimate of the metagenomic complexity after removing host DNA.
all_data %>%
select(dataset,Extraction,nonpareil,Taxon) %>%
unique() %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(nonpareil), sd(nonpareil))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of breadth of host genome coverage")| Taxon | ZYMO | DREX | EHEX |
|---|---|---|---|
| Amphibian | 0.9±0.1 | 0.8±0.1 | 0.8±0.1 |
| Reptile | 0.9±0.1 | 0.9±0.0 | 0.9±0.1 |
| Mammal | 0.8±0.2 | 0.8±0.1 | 0.7±0.3 |
| Bird | 0.9±0.1 | 1.0±0.0 | 0.8±0.4 |
| Control | 0.0±0.0 | 0.5±0.6 | 0.5±0.7 |
all_data %>%
select(dataset,Extraction,nonpareil,Taxon,Species) %>%
unique() %>%
ggplot(aes(x=Extraction,y=nonpareil, color=Species, group=Extraction)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(. ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Nonpareil completeness",x="Extraction method")all_data %>%
select(dataset,Extraction,Sample,Species,nonpareil,Taxon) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(nonpareil ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 0.885552853 | 0.03450229 | 25.6664948 | 40.89542 | 7.753071e-27 |
| fixed | NA | ExtractionDREX | 0.005536892 | 0.04336611 | 0.1276779 | 48.00020 | 8.989373e-01 |
| fixed | NA | ExtractionEHEX | -0.112193866 | 0.04336611 | -2.5871326 | 48.00020 | 1.276294e-02 |
| ran_pars | Sample | sd__(Intercept) | 0.059565487 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 0.035030813 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.150224598 | NA | NA | NA | NA |
5.2 Alpha diversity
Variance partitioning metrics are derived from community_analysis.Rmd.
alpha_data <- list.files(path = "results", pattern = "^alpha_.*\\.tsv$", full.names = TRUE) %>%
map_df(~ read_tsv(.)) %>%
left_join(all_data,by= join_by(dataset==dataset))alpha_data %>%
pivot_longer(!c(dataset,Library,Species,Taxon,Sample,Extraction), names_to = "metric", values_to = "value") %>%
filter(metric %in% c("richness","neutral","phylogenetic")) %>%
mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal"))) %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
unique() %>%
ggplot(aes(x=Extraction,y=value, color=Species, group=Extraction)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(metric ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Diversity",x="Extraction method")5.2.1 Richness
alpha_data %>%
select(dataset,Extraction,Sample,Species,richness,Taxon) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
lmerTest::lmer(richness ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 59.44444444 | 10.715517 | 5.5475108 | 10.12322 | 0.000234351 |
| fixed | NA | ExtractionDREX | 0.05555556 | 4.453352 | 0.0124750 | 34.97883 | 0.990117532 |
| fixed | NA | ExtractionEHEX | 1.38257472 | 4.545332 | 0.3041746 | 35.13134 | 0.762789322 |
| ran_pars | Sample | sd__(Intercept) | 16.00359332 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 28.56742258 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 13.36005511 | NA | NA | NA | NA |
5.2.2 Neutral
alpha_data %>%
select(dataset,Extraction,Sample,Species,neutral,Taxon) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
lmerTest::lmer(neutral ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 29.5413181 | 5.139162 | 5.7482754 | 9.895197 | 0.000193227 |
| fixed | NA | ExtractionDREX | -2.1102758 | 1.922343 | -1.0977625 | 35.042971 | 0.279794105 |
| fixed | NA | ExtractionEHEX | -0.3422373 | 1.962694 | -0.1743712 | 35.152526 | 0.862574163 |
| ran_pars | Sample | sd__(Intercept) | 8.6429526 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 13.5543072 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 5.7670282 | NA | NA | NA | NA |
5.2.3 Phylogenetic
alpha_data %>%
select(dataset,Extraction,Sample,Species,phylogenetic,Taxon) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
lmerTest::lmer(phylogenetic ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 4.7721984 | 0.4324985 | 11.034024 | 9.653749 | 8.685117e-07 |
| fixed | NA | ExtractionDREX | 0.1521100 | 0.1403528 | 1.083769 | 35.015284 | 2.858737e-01 |
| fixed | NA | ExtractionEHEX | 0.1852709 | 0.1433723 | 1.292236 | 35.056809 | 2.047290e-01 |
| ran_pars | Sample | sd__(Intercept) | 1.2179889 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 0.9236345 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.4210584 | NA | NA | NA | NA |
5.3 Microbial complexity recovery
all_data %>%
select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon) %>%
mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
mutate(damr=ifelse(is.na(damr),0,damr)) %>%
unique() %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(damr), sd(damr))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of breadth of host genome coverage")| Taxon | ZYMO | DREX | EHEX |
|---|---|---|---|
| Amphibian | 0.8±0.4 | 0.8±0.3 | 0.8±0.3 |
| Reptile | 0.9±0.1 | 1.0±0.0 | 1.0±0.0 |
| Mammal | 0.8±0.1 | 0.9±0.1 | 0.8±0.3 |
| Bird | 0.6±0.4 | 0.5±0.4 | 0.6±0.4 |
| Control | 1.0±0.0 | 1.0±0.0 | 1.0±0.0 |
all_data %>%
select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon,Sample,Species) %>%
mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
mutate(damr=ifelse(is.na(damr),0,damr)) %>%
unique() %>%
ggplot(aes(x=Extraction,y=damr, color=Species, group=Extraction)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(. ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Domain-adjusted mapping rate",x="Extraction method")all_data %>%
select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon, Sample, Species) %>%
mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
mutate(damr=ifelse(is.na(damr),0,damr)) %>%
unique() %>%
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(damr ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 0.78271978 | 0.06660971 | 11.7508366 | 15.45687 | 4.117675e-09 |
| fixed | NA | ExtractionDREX | 0.03348421 | 0.03977983 | 0.8417385 | 130.19072 | 4.014779e-01 |
| fixed | NA | ExtractionEHEX | 0.02228400 | 0.03999367 | 0.5571881 | 130.24817 | 5.783552e-01 |
| ran_pars | Sample | sd__(Intercept) | 0.12024342 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 0.19047224 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.20283813 | NA | NA | NA | NA |
5.4 Variance partitioning
Variance partitioning metrics are derived from community_analysis.Rmd.
variance_data <- list.files(path = "results", pattern = "^var_.*\\.tsv$", full.names = TRUE) %>%
map_df(~ read_tsv(.))
variance_data %>% summarise(mean=mean(r2),sd=sd(r2))# A tibble: 1 × 2
mean sd
<dbl> <dbl>
1 0.102 0.0768
variance_data %>%
left_join(sample_metadata %>% select(Species,Taxon) %>% unique(),by=join_by(species==Species)) %>%
mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal"))) %>%
ggplot(aes(x=Taxon,y=r2)) +
geom_boxplot()+
ylim(0,1)+
facet_grid(. ~ metric, scales = "free")| metric | mean | sd |
|---|---|---|
| neutral | 0.06201935 | 0.04744380 |
| phylogenetic | 0.07381277 | 0.06884687 |
| richness | 0.16970818 | 0.06626388 |
5.5 Combined community analysis
species="combined"
genus=species
sample_metadata <- read_tsv(paste0("data/metadata.tsv")) %>%
rename(dataset=Dataset)
read_counts <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_counts.tsv.gz") %>%
rename(genome = 1)
genome_metadata <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_mag_info.tsv.gz") %>%
rename(genome = 1, length=mag_size)
genome_coverage <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_coverage.tsv.gz") %>%
rename(genome = 1)
download.file("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134.tree.gz", "data/DMB0134.tree.gz")
genome_tree <- read_tree("data/DMB0134.tree.gz")5.5.1 Filter data
#Filter by coverage
min_coverage=0.3
read_counts_filt <- genome_coverage %>%
mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
mutate(across(-1, ~ . * read_counts[[cur_column()]]))
# Transform to genome counts (non-filtered)
readlength=150
genome_counts <- read_counts %>%
mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))
# Transform to genome counts (coverage-filtered)
readlength=150
genome_counts_filt <- read_counts_filt %>%
mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))5.5.2 Community barplot
# Retrieve EHI taxonomy colors
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
select(colors) %>%
pull()
# Stacked barplot
genome_counts_filt %>%
mutate_at(vars(-genome), funs(./sum(., na.rm = TRUE))) %>% #apply TSS nornalisation
pivot_longer(-genome, names_to = "dataset", values_to = "count") %>%
left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append taxonomy
left_join(., sample_metadata, by = join_by(dataset == dataset)) %>% #append taxonomy
mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
filter(Taxon != "Control") %>%
ggplot(., aes(x=dataset,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
geom_bar(stat="identity", colour="white", linewidth=0.1)+ #plot stacked bars with white borders
scale_fill_manual(values=phylum_colors) +
labs(y = "Relative abundance") +
facet_nested(. ~ Taxon + Species + Sample + Extraction, scales="free_x") +
guides(fill = guide_legend(ncol = 3)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.title.x = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
legend.position="none",
legend.title=element_blank(),
panel.spacing = unit(0, "lines"))genome_counts_NMDS <- genome_counts_filt %>%
column_to_rownames(var="genome") %>%
select(where(~!all(. == 0))) %>%
hillpair(.,q=1, metric="C", out="dist") %>%
metaMDS(.,trymax = 999, k=2, trace=0) %>%
vegan::scores() %>%
as_tibble(., rownames = "dataset") %>%
left_join(sample_metadata, by = join_by(dataset == dataset)) %>%
filter(Taxon != "Control") %>%
group_by(Sample) %>%
mutate(sample_x=mean(NMDS1), sample_y=mean(NMDS2))
genome_counts_NMDS %>%
ggplot(., aes(x=NMDS1,y=NMDS2, color=Species, shape=Extraction)) +
scale_color_manual(values=vertebrate_colors) +
geom_point(size=3, alpha=0.8) +
geom_segment(aes(x=sample_x, y=sample_y, xend=NMDS1, yend=NMDS2), alpha=0.2) +
theme_classic() +
theme(legend.position="right", legend.box="vertical") +
guides(color=guide_legend(title="Species"))